Takehome exam
Load packages
library(Seurat)## Registered S3 method overwritten by 'spatstat.core':
## method from
## formula.glmmPQL MASS
## Attaching SeuratObject
## Attaching sp
library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(Matrix)
library(ggplot2)
library(grid)
library(tidyr)##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library(ggdendro)
library(SCINA)## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(SingleR)## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:gplots':
##
## space
## The following object is masked from 'package:tidyr':
##
## expand
## The following objects are masked from 'package:Matrix':
##
## expand, unname
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:sp':
##
## %over%
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
##
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
##
## Assays
## The following object is masked from 'package:Seurat':
##
## Assays
library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble 3.1.6 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ IRanges::collapse() masks dplyr::collapse()
## ✖ Biobase::combine() masks BiocGenerics::combine(), dplyr::combine()
## ✖ matrixStats::count() masks dplyr::count()
## ✖ IRanges::desc() masks dplyr::desc()
## ✖ S4Vectors::expand() masks tidyr::expand(), Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ S4Vectors::first() masks dplyr::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ BiocGenerics::Position() masks ggplot2::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ S4Vectors::rename() masks dplyr::rename()
## ✖ MASS::select() masks dplyr::select()
## ✖ IRanges::slice() masks dplyr::slice()
## ✖ tidyr::unpack() masks Matrix::unpack()
Functions
0.1 Generate read matrix of each donors
processMatrix <- function(mat, genes, samples) {
mat <- t(mat)
rownames(mat) <- genes
#samples <- gsub("_","-",samples)
colnames(mat) <- samples
return(mat)
}1 Data
The raw data used for this take-home exam is the data processed in the term project. The SRR ids for each cell sample were obtained via the command:
esearch -db sra -query PRJNA438394 | efetch –format runinfo | cut -d “,” -f 1| grep SRR
FastQs were downloaded through Sratoolkit, then the data of each sample was processed by the following pipeline: - TrimGalore and FastQC (on both fastq files if paired-end data) -> Trimming FastQs - STAR -> alignments from FastQs - R-SEM -> counts from BAMS
The R-SEM raw count and TPM count data for each gene and sample of each donor are merged into a single matrix. Due to issues with the fastq files, 42 samples from donor-40 and 1 sample from donor-54 were removed from the analysis. The data received would be assessed according to the quality control thresholds: minimum read mapping percentage in excess of 95% and maximum percentage of reads lost to trimming below 5%.
2 Generate Seruat object
2.1 Generate matrix for each donor
# column names for datasets
donor37_colnames <- read.table("all_donor37_names.tsv")$V1
donor38_colnames <- read.table("all_donor38_names.tsv")$V1
donor39_colnames <- read.table("all_donor39_names.tsv")$V1
# 43 samples removed due to internal fastq issues
donor40_colnames <- read.table("all_donor40_withval.tsv")$V1
donor53_colnames <- read.table("all_donor53_names.tsv")$V1
# removes the paired end / possibly broken 059 sample
donor54_colnames <- read.table("scRNA_Donor54rm_samples_to_SRR.tsv")$V1
gene_rownames <- read.table("89_SCT10_Donor-37_CD103pos_TIL_N715_S513.genes.results",header=TRUE)$gene_id2.1.1 donor37
donor <- "donor37"
donor37_matrix <- read.csv("Donor-37_RAW_matrix.tsv",sep="\t",header=FALSE)
donor37_matrix$V27044 <- NULL
donor37_matrix <- processMatrix(donor37_matrix,gene_rownames,donor37_colnames)2.1.2 donor38
donor <- "donor38"
donor38_matrix <- read.csv("Donor-38_RAW_matrix.tsv",sep="\t",header=FALSE)
donor38_matrix$V27044 <- NULL
donor38_matrix <- processMatrix(donor38_matrix,gene_rownames,donor38_colnames)2.1.3 donor39
donor <- "donor39"
donor39_matrix <- read.csv("Donor-39_RAW_matrix.tsv",sep="\t",header=FALSE)
donor39_matrix$V27044 <- NULL
donor39_matrix <- processMatrix(donor39_matrix,gene_rownames,donor39_colnames)2.1.4 donor40
donor <- "donor40"
donor40_matrix <- read.csv("Donor-40_RAW_matrix.tsv",sep="\t",header=FALSE)
donor40_matrix$V27044 <- NULL
donor40_matrix <- processMatrix(donor40_matrix,gene_rownames,donor40_colnames)2.1.5 donor53
donor <- "donor53"
donor53_matrix <- read.csv("Donor-53_RAW_matrix.tsv",sep="\t",header=FALSE)
donor53_matrix$V27044 <- NULL
donor53_matrix <- processMatrix(donor53_matrix,gene_rownames,donor53_colnames)2.1.6 donor 54
donor <- "donor54"
donor54_matrix <- read.csv("Donor-54_RAW_matrix.tsv",sep="\t",header=FALSE)
donor54_matrix$V27044 <- NULL
donor54_matrix <- processMatrix(donor54_matrix,gene_rownames,donor54_colnames)2.2 Combind the matrix to single matrix
# original dataset from PRJ
orig_mat <- read.table("GSE111894_Clarke_SC_raw_counts.txt")
orig_mat2 <- read.table("GSE111894_Clarke_SC_raw_counts_upd.txt")
orig_matrix <- cbind(orig_mat,orig_mat2)
rm(orig_mat,orig_mat2)
orig_gene_count <- dim(orig_matrix)[1]
print(paste0("Number of genes in original: ", dim(orig_matrix)[1]))## [1] "Number of genes in original: 23368"
our_total_matrix <- cbind(donor37_matrix, donor38_matrix, donor39_matrix,
donor40_matrix, donor53_matrix, donor54_matrix)
total_names <- colnames(our_total_matrix)
total_names <- sub("-","_", total_names)
total_names <- gsub("_","-", total_names)
# original dataset matrix to SeuratObject
orig_names <- colnames(orig_matrix)
orig_names <- sub("X","",orig_names)
orig_names <- sub("\\.","-",orig_names)
orig_names <- gsub("_","-",orig_names)
colnames(orig_matrix) <- orig_names
orig_matrix_sub <- orig_matrix[,orig_names %in% total_names]
orig_matrix_sub <- orig_matrix_sub[rownames(orig_matrix_sub) %in% gene_rownames, ]
print(paste0("New Number of genes: ", dim(orig_matrix_sub)[1]))## [1] "New Number of genes: 21221"
our_total_matrix <- our_total_matrix[, total_names %in% colnames(orig_matrix_sub)]
our_total_matrix <- our_total_matrix[rownames(our_total_matrix) %in% rownames(orig_matrix_sub),]2.3 Generate Seurate subject
The matrices from each donor is generated as a Seurat object, and the merged matrix is also generated as a seurat object
donor37_obj <- CreateSeuratObject(counts = donor37_matrix, project = "subset", assay = "RNA")## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
donor38_obj <- CreateSeuratObject(counts = donor38_matrix, project = "subset", assay = "RNA")## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
donor39_obj <- CreateSeuratObject(counts = donor39_matrix, project = "subset", assay = "RNA")## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
donor40_obj <- CreateSeuratObject(counts = donor40_matrix, project = "subset", assay = "RNA")## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
donor53_obj <- CreateSeuratObject(counts = donor53_matrix, project = "subset", assay = "RNA")## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
donor54_obj <- CreateSeuratObject(counts = donor54_matrix, project = "subset", assay = "RNA")## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
our_total_obj <- CreateSeuratObject(counts = our_total_matrix, project = "subset", assay = "RNA")
our_total_obj## An object of class Seurat
## 21221 features across 1042 samples within 1 assay
## Active assay: RNA (21221 features, 0 variable features)
The seurat object for total data our_total_obj contains 1042 cell samples, each sample contains 21221 genes
##VlnPlot
VlnPlot(our_total_obj, features = c("nFeature_RNA", "nCount_RNA"), ncol = 3)FeatureScatter(our_total_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")Data with nFeature_RNA less than 200 and greater than 2500 were filtered out
our_total_obj <- subset(our_total_obj, subset = nFeature_RNA > 200 & nFeature_RNA < 2500)
VlnPlot(our_total_obj, features = c("nFeature_RNA", "nCount_RNA"), ncol = 3)As the result, 369 samples were filted out from the seruat subject, 673 samples are remained in the data, with 21221 genes for each sample.
2.4 Normalizing
After removing unwanted cells from the dataset, the data would be normalized.
our_total_obj <- NormalizeData(our_total_obj, normalization.method = "LogNormalize", scale.factor = 10000)2.5 Identification of highly variable features
our_total_obj <- FindVariableFeatures(our_total_obj, selection.method = "vst", nfeatures = 5000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(our_total_obj), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(our_total_obj)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)## When using repel, set xnudge and ynudge to 0 for optimal results
plot1## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 5958 rows containing missing values (geom_point).
plot2## Warning: Transformation introduced infinite values in continuous x-axis
## Removed 5958 rows containing missing values (geom_point).
The top10 genes with highest standardized variance are shown in the figure.
2.6 Scaling the data
all.genes <- rownames(our_total_obj)
our_total_obj <- ScaleData(our_total_obj, features = all.genes)## Centering and scaling data matrix
2.7 Linear dimensional reduction
PCA is performed on the scaled data.
our_total_obj <- RunPCA(our_total_obj, features = VariableFeatures(object = our_total_obj))## PC_ 1
## Positive: MTRNR2L8, MTRNR2L2, MALAT1, LDB1, RPPH1, RNF213, CWF19L2, NBPF10, LOC100190986, ANKRD11
## ZXDA, KCNQ1OT1, PLCG2, HIST1H2BN, HIST1H2AM, RERE, ZNF136, ACVRL1, ZBTB7A, CHD3
## RNF144A, HNRNPUL2, KIAA1109, WASH3P, RPS27, TAT, HIST1H2AH, LOH12CR2, MTOR, PLEC
## Negative: ACTB, GAPDH, COTL1, DDX5, YWHAZ, RAC2, CD74, CORO1A, UBC, HSPA8
## UCP2, PGK1, ACTR3, EIF4G2, PGAM1, LDHA, PRKCH, HLA-DRB1, HLA-DPB1, TPI1
## TAP1, MSN, HLA-DRA, H3F3B, RGS1, HLA-DRB5, EIF4A1, GZMA, EEF1A1, CALM3
## PC_ 2
## Positive: LTB, IER2, CD69, JUNB, RPL32, H3F3B, CLDND1, ALOX5AP, ANXA1, DUSP1
## RPS18, RPL39, IL7R, RPL41, TSC22D3, RPL27A, ROMO1, FOS, RPLP0, DNAJB1
## RPS19, GPR183, HINT1, RPS3A, MAGED2, RPS27, PNKD, MRPL14, RPS17, MRPS26
## Negative: HAVCR2, LYST, MTRNR2L2, HLA-DRB5, CTLA4, CD38, MTRNR2L8, TNFRSF9, XIST, GBP5
## TIGIT, GBP2, CCL3, GBP4, WARS, HLA-DRA, HLA-DRB1, RDH10, RNF213, STAT1
## SLA, TOX, PRRC2C, FCRL3, HAPLN3, IRF4, NR4A2, IFNG, NEAT1, CD74
## PC_ 3
## Positive: CXCR6, RDH10, TRIM22, GBP5, GBP4, MVP, XAF1, RBM39, HAVCR2, TNFRSF9
## KIR2DL4, APOL6, WARS, CLK1, GZMB, ATRX, GIMAP5, CD244, ZBED2, DDX5
## GBP2, TNFRSF1B, TIGIT, ACSL4, CARD16, HAPLN3, SNX14, IFI6, OASL, CCL3
## Negative: TYMS, SKA3, TOP2A, PKMYT1, APOBEC3B, ZWINT, CCNB2, MELK, CCNA2, PLK1
## NUSAP1, STMN1, CDK1, TROAP, CEP152, PRC1, TCF19, SHCBP1, TUBB, FEN1
## MKI67, NCAPD3, GTSE1, TUBA1B, MYBL2, GGH, HMGB2, SPAG5, NUF2, FBXO5
## PC_ 4
## Positive: MX1, GZMA, IFIT1, IFI44L, OASL, CMPK2, IFIT3, HLA-DRB1, USP18, TNFRSF18
## OSTM1, IFI6, ZNF354A, ZFYVE21, HSPA1A, TBC1D10C, ADD1, GZMH, CD74, RBPJ
## TNFSF10, COX20, TREX1, MLYCD, RGS16, NUP160, INPP1, SYNGR2, ALOX5AP, CAPN2
## Negative: FOS, JUNB, NR4A2, NFKBIA, TNFAIP3, CXCR4, TSC22D3, KLF6, DUSP1, TAGAP
## FAM46C, ZNF331, DUSP2, CD55, S1PR1, CD69, RPS3A, IL7R, RPL13A, RPL3
## EEF1A1, FAM65B, PABPC1, CD4, RPL27A, CSRNP1, RPLP0, GPR183, TXK, SRSF7
## PC_ 5
## Positive: GZMB, GNLY, RGS1, CCL4, TNFAIP3, SLC27A2, IFIT1, KIR2DL4, IFI44L, CD69
## USP18, DNAJB1, MX1, SAT1, CCL4L2, GZMH, RDH10, XYLT2, IFI6, CWF19L1
## HSH2D, HPSE, FAM46C, CLEC2D, IFIT3, HSPA5, IPO13, KRT86, RGS2, CDCA7L
## Negative: EEF1A1, ICA1, DAPP1, TPT1, CCDC71L, TNFRSF4, CCR7, FAF1, CD4, RPS2
## MAP7D3, PTP4A2, RNF20, EEF1G, CHST7, ITM2A, CHID1, SGPP2, HLA-DRB5, TXLNG
## RPS23, IGFBP4, RPL31, PRDX2, TTC27, RPL4, DPP9, FOXP3, RPL5, ALKBH4
print(our_total_obj[["pca"]], dims = 1:5, nfeature = 5)## PC_ 1
## Positive: MTRNR2L8, MTRNR2L2, MALAT1, LDB1, RPPH1
## Negative: ACTB, GAPDH, COTL1, DDX5, YWHAZ
## PC_ 2
## Positive: LTB, IER2, CD69, JUNB, RPL32
## Negative: HAVCR2, LYST, MTRNR2L2, HLA-DRB5, CTLA4
## PC_ 3
## Positive: CXCR6, RDH10, TRIM22, GBP5, GBP4
## Negative: TYMS, SKA3, TOP2A, PKMYT1, APOBEC3B
## PC_ 4
## Positive: MX1, GZMA, IFIT1, IFI44L, OASL
## Negative: FOS, JUNB, NR4A2, NFKBIA, TNFAIP3
## PC_ 5
## Positive: GZMB, GNLY, RGS1, CCL4, TNFAIP3
## Negative: EEF1A1, ICA1, DAPP1, TPT1, CCDC71L
VizDimLoadings(our_total_obj, dims = 1:2, reduction = "pca")DimPlot(our_total_obj, reduction = "pca")DimHeatmap(our_total_obj, dims = 1:10, cells = 1000, balanced = TRUE)## Warning: Requested number is larger than the number of available items (673).
## Setting to 673.
## Warning: Requested number is larger than the number of available items (673).
## Setting to 673.
## Warning: Requested number is larger than the number of available items (673).
## Setting to 673.
## Warning: Requested number is larger than the number of available items (673).
## Setting to 673.
## Warning: Requested number is larger than the number of available items (673).
## Setting to 673.
## Warning: Requested number is larger than the number of available items (673).
## Setting to 673.
## Warning: Requested number is larger than the number of available items (673).
## Setting to 673.
## Warning: Requested number is larger than the number of available items (673).
## Setting to 673.
## Warning: Requested number is larger than the number of available items (673).
## Setting to 673.
## Warning: Requested number is larger than the number of available items (673).
## Setting to 673.
2.8 Determine the “dimensionality” of dataset
our_total_obj <- JackStraw(our_total_obj, num.replicate = 100)
our_total_obj <- ScoreJackStraw(our_total_obj, dims = 1:20)JackStrawPlot(our_total_obj, dims = 1:15)## Warning: Removed 61290 rows containing missing values (geom_point).
ElbowPlot(our_total_obj)2.9 Cluster the cells
The cells in the seruat object are clustered into groups.
our_total_obj <- FindNeighbors(our_total_obj, dims = 1:15)## Computing nearest neighbor graph
## Computing SNN
our_total_obj <- FindClusters(our_total_obj, resolution = 0.5)## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 673
## Number of edges: 27658
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7684
## Number of communities: 6
## Elapsed time: 0 seconds
our_total_obj <- RunUMAP(our_total_obj, dims = 1:10)## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 05:02:11 UMAP embedding parameters a = 0.9922 b = 1.112
## 05:02:11 Read 673 rows and found 10 numeric columns
## 05:02:11 Using Annoy for neighbor search, n_neighbors = 30
## 05:02:11 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 05:02:11 Writing NN index file to temp file /var/folders/tf/3cxd6sr54lq_t847lh8s_b080000gn/T//RtmpCocCgM/file10dd326aac5cc
## 05:02:11 Searching Annoy index using 1 thread, search_k = 3000
## 05:02:11 Annoy recall = 100%
## 05:02:11 Commencing smooth kNN distance calibration using 1 thread
## 05:02:12 Initializing from normalized Laplacian + noise
## 05:02:12 Commencing optimization for 500 epochs, with 25866 positive edges
## 05:02:13 Optimization finished
The 673 gene samples are clustered into 6 groups.
DimPlot(our_total_obj, reduction = "umap")our_total_obj <- RunTSNE(our_total_obj,dims=1:10)
tsneplot <- TSNEPlot(our_total_obj,label=TRUE,
pt.size=1.5)+NoLegend()
tsneplotThe biomarkers for every cluster compared to all remaining cells are identified
obj.markers <- FindAllMarkers(our_total_obj, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
The most significant biomarker for each groups are shown below.
obj.markers %>%
group_by(cluster) %>%
slice_max(n = 5, order_by = avg_log2FC)## # A tibble: 30 × 7
## # Groups: cluster [6]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 3.29e-57 2.12 0.9 0.286 6.99e-53 0 ZNF683
## 2 5.25e-57 2.08 0.791 0.184 1.11e-52 0 CAPG
## 3 1.04e-12 1.86 0.255 0.065 2.21e- 8 0 HSPA6
## 4 7.37e-20 1.44 0.816 0.535 1.56e-15 0 HSPA1A
## 5 8.43e-37 1.41 0.703 0.205 1.79e-32 0 ITM2C
## 6 1.19e-21 2.26 0.48 0.157 2.53e-17 1 TNFRSF9
## 7 3.37e-39 2.11 0.8 0.349 7.15e-35 1 HLA-DRB5
## 8 1.96e-30 1.87 0.583 0.159 4.16e-26 1 CTLA4
## 9 2.62e-22 1.86 0.611 0.275 5.57e-18 1 HAVCR2
## 10 7.00e-30 1.77 0.989 0.867 1.49e-25 1 LAG3
## # … with 20 more rows
The expression heatmap for given cells and features are generated, and the top5 markers for each cluster are shown in the map.
obj.markers %>%
group_by(cluster) %>%
top_n(n = 5, wt = avg_log2FC) -> top5
DoHeatmap(our_total_obj, features = top5$gene) + NoLegend()3 Analysis
3.1 Pathway enrichment in epithelial cells
Canoonical markers for identification is used to identify the Epithelial cells. CAPS, TMEM190, PIFO, and SNTN are used as biomarker for Epithelial cells, and EPCAM is used as biomarker for cancer.
VlnPlot(our_total_obj, features = c("CAPS", "TMEM190", "PIFO", "SNTN", "EPCAM"))## Warning in FetchData.Seurat(object = object, vars = features, slot = slot): The
## following requested variables were not found: PIFO
## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE], idents =
## idents, : All cells have the same value of TMEM190.
The result shows that only a tiny fraction of cells express the genes the study is interested in. The cell with biomarkers for epithelial cells and cancer cells are splited into different clusters.
FeaturePlot(our_total_obj, features = c("CAPS", "TMEM190", "PIFO", "SNTN", "EPCAM"))## Warning in FetchData.Seurat(object = object, vars = c(dims, "ident",
## features), : The following requested variables were not found: PIFO
## Warning in FeaturePlot(our_total_obj, features = c("CAPS", "TMEM190", "PIFO", :
## All cells have the same value (0) of TMEM190.
Since the flow cytometry was performed to isolate the Cytotoxic T cell (CTLs) for single-cell sequencing (smartseq2), not many cells other than CTLs are expected to be found in this data.
SCINA is used to identify the Epithelial cells and cancer cells in the total dataset. Since TRM is the main research target of the data source article, more than half of the cells are TRM, so the biomarker ITGAE of TRM is also included
as.data.frame(our_total_obj@assays$RNA[,]) -> scina.data
load(system.file('extdata','example_signatures.RData', package = "SCINA"))
signatures <- list("Epithelial cells" = c("CAPS", "TMEM190", "PIFO", "SNTN"),
"CD103" = c("ITGAE"),
"Cancer" = c("EPCAM", "EGFR"))SCINA(
scina.data,
signatures,
max_iter = 100,
convergence_n = 10,
convergence_rate = 0.999,
sensitivity_cutoff = 0.9,
rm_overlap=TRUE,
allow_unknown=TRUE
) -> scina.results
our_total_obj$scina_labels <- scina.results$cell_labels
DimPlot(our_total_obj,reduction = "tsne", pt.size = 1, label = TRUE, group.by = "scina_labels", label.size = 5)DimPlot(our_total_obj, reduction = "umap", split.by = "scina_labels")epithelial <- subset(our_total_obj, subset = scina_labels %in% "Epithelial cells")Only 5 cells are classified as epithelial cells in lung cancer. Pathway enrichment is performed on the 5 cells.
library(cerebroApp)
hallmark_gene_gmt <- system.file("h.all.v7.5.1.symbols.gmt.txt",package = "cerebroApp")
# gives the enrichment of our trm and non-trm cells in the apoptotic pathway - from MSIGDB
apopt_gsea <- performGeneSetEnrichmentAnalysis(our_total_obj, assay="RNA",GMT_file="h.all.v7.5.1.symbols.gmt.txt", groups = 'seurat_clusters')## [05:02:29] Loading gene sets...
## [05:02:29] Loaded 50 gene sets from GMT file.
## [05:02:29] Extracting transcript counts from `data` slot of `RNA` assay...
## [05:02:29] Performing analysis for 6 subgroups of group `seurat_clusters`...
## Estimating GSVA scores for 50 gene sets.
## Estimating ECDFs with Gaussian kernels
##
|
| | 0%
|
|= | 2%
|
|=== | 4%
|
|==== | 6%
|
|====== | 8%
|
|======= | 10%
|
|======== | 12%
|
|========== | 14%
|
|=========== | 16%
|
|============= | 18%
|
|============== | 20%
|
|=============== | 22%
|
|================= | 24%
|
|================== | 26%
|
|==================== | 28%
|
|===================== | 30%
|
|====================== | 32%
|
|======================== | 34%
|
|========================= | 36%
|
|=========================== | 38%
|
|============================ | 40%
|
|============================= | 42%
|
|=============================== | 44%
|
|================================ | 46%
|
|================================== | 48%
|
|=================================== | 50%
|
|==================================== | 52%
|
|====================================== | 54%
|
|======================================= | 56%
|
|========================================= | 58%
|
|========================================== | 60%
|
|=========================================== | 62%
|
|============================================= | 64%
|
|============================================== | 66%
|
|================================================ | 68%
|
|================================================= | 70%
|
|================================================== | 72%
|
|==================================================== | 74%
|
|===================================================== | 76%
|
|======================================================= | 78%
|
|======================================================== | 80%
|
|========================================================= | 82%
|
|=========================================================== | 84%
|
|============================================================ | 86%
|
|============================================================== | 88%
|
|=============================================================== | 90%
|
|================================================================ | 92%
|
|================================================================== | 94%
|
|=================================================================== | 96%
|
|===================================================================== | 98%
|
|======================================================================| 100%
## [05:02:30] 0 gene sets passed the thresholds across all subgroups of group `seurat_clusters`.
The sample size is too small to perform GSEA (it didn’t pass the thresholds across all subgroups)
3.2 Smoke vs. non-smoke
According to the supplementary table, Donors 37, donor38, donor 39, and donor 40 have known information on whether they smoke or not. Donor 37, donor38, and donor 39 are former smokers, and donor 40 never smoked before.
all_col_names <- colnames(our_total_obj)
Donors <- paste("D",
sapply(strsplit(sapply(strsplit(all_col_names, split = "_D"), "[[", 2), split = "_"), "[[", 1), sep = "")our_total_obj$donor <- Donorssmoke <- subset(our_total_obj, subset = donor %in% c("Donor-37", "Donor-38", "Donor-39", "Donor-40"))
identity_vector = c()
identity_vector2 = c()
donors <- colnames(smoke)
count = 1
cluster_vector <- Idents(smoke)
for (col_val in donors){
if (grepl("donor-40", tolower(col_val))) {
identity_vector <- append(identity_vector, paste0("Never",cluster_vector[count]))
identity_vector2 <- append(identity_vector2, "Never")
} else {
identity_vector <- append(identity_vector, paste("Ex",cluster_vector[count]))
identity_vector2 <- append(identity_vector2, "Ex")
}
count <- count + 1
}
smoke$orig.ident <- identity_vector
smoke$whether_smoke <- identity_vector2
Hmisc::describe(smoke$whether_smoke)## smoke$whether_smoke
## n missing distinct
## 347 0 2
##
## Value Ex Never
## Frequency 240 107
## Proportion 0.692 0.308
DimPlot(smoke, reduction = "umap", group.by = "whether_smoke")DimPlot(smoke, reduction = "umap", split.by = "whether_smoke")Donor 37, donor39, and donor 40 are diagnosed with the same type of cancer (Lung Adenocarcinoma), donor 38 is diagnosed with Lung Squamous. To eliminate bias, a subset of the Seurat object is generated, which only contains information for those three donors with the same kind of cancer.
smoke <- subset(our_total_obj, subset = donor %in% c("Donor-37", "Donor-39", "Donor-40"))
identity_vector = c()
identity_vector2 = c()
donors <- colnames(smoke)
count = 1
cluster_vector <- Idents(smoke)
for (col_val in donors){
if (grepl("donor-40", tolower(col_val))) {
identity_vector <- append(identity_vector, paste0("Never",cluster_vector[count]))
identity_vector2 <- append(identity_vector2, "Never")
} else {
identity_vector <- append(identity_vector, paste("Ex",cluster_vector[count]))
identity_vector2 <- append(identity_vector2, "Ex")
}
count <- count + 1
}
smoke$orig.ident <- identity_vector
smoke$whether_smoke <- identity_vector2
Hmisc::describe(smoke$whether_smoke)## smoke$whether_smoke
## n missing distinct
## 265 0 2
##
## Value Ex Never
## Frequency 158 107
## Proportion 0.596 0.404
DimPlot(smoke, reduction = "umap", group.by = "whether_smoke")DimPlot(smoke, reduction = "umap", split.by = "whether_smoke")By comparing the umap between the “Ex” group and the “Never” group, cells in cluster 2 show little in the never-smoking group; meanwhile, cluster 3 is relatively denser in the never-smoking group.
The biomarkers for every cluster compared to all remaining cells in smoke are identified
smoke.marker <- FindAllMarkers(smoke, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
The most significant biomarker for each groups are shown below.
smoke.marker %>%
group_by(cluster) %>%
slice_max(n = 10, order_by = avg_log2FC)## # A tibble: 30 × 7
## # Groups: cluster [3]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 4.34e-13 2.13 0.839 0.352 0.00000000921 0 CAPG
## 2 2.63e-10 2.07 0.701 0.222 0.00000557 0 OASL
## 3 9.85e- 9 1.98 0.706 0.333 0.000209 0 PDIA6
## 4 4.83e- 6 1.95 0.512 0.185 0.102 0 HLA-DRA
## 5 1.02e-10 1.94 0.791 0.352 0.00000216 0 APOBEC3C
## 6 1.34e-11 1.88 0.744 0.296 0.000000285 0 CDC42
## 7 1.31e- 5 1.84 0.483 0.185 0.278 0 MAPRE2
## 8 4.81e- 4 1.82 0.493 0.278 1 0 TAF7
## 9 1.05e- 5 1.82 0.493 0.204 0.222 0 SCAMP2
## 10 8.00e- 7 1.81 0.517 0.185 0.0170 0 SAMSN1
## # … with 20 more rows
The expression heatmap for given cells and features are generated, and the top5 markers for each cluster are shown in the map.
smoke.marker %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC) -> top5
DoHeatmap(smoke, features = top5$gene) + NoLegend()library(cerebroApp)
smoke_gsea <- performGeneSetEnrichmentAnalysis(smoke, assay="RNA",GMT_file="h.all.v7.5.1.symbols.gmt.txt", groups = 'whether_smoke')## [05:02:35] Loading gene sets...
## [05:02:35] Loaded 50 gene sets from GMT file.
## [05:02:35] Extracting transcript counts from `data` slot of `RNA` assay...
## [05:02:35] Performing analysis for 2 subgroups of group `whether_smoke`...
## Estimating GSVA scores for 50 gene sets.
## Estimating ECDFs with Gaussian kernels
##
|
| | 0%
|
|= | 2%
|
|=== | 4%
|
|==== | 6%
|
|====== | 8%
|
|======= | 10%
|
|======== | 12%
|
|========== | 14%
|
|=========== | 16%
|
|============= | 18%
|
|============== | 20%
|
|=============== | 22%
|
|================= | 24%
|
|================== | 26%
|
|==================== | 28%
|
|===================== | 30%
|
|====================== | 32%
|
|======================== | 34%
|
|========================= | 36%
|
|=========================== | 38%
|
|============================ | 40%
|
|============================= | 42%
|
|=============================== | 44%
|
|================================ | 46%
|
|================================== | 48%
|
|=================================== | 50%
|
|==================================== | 52%
|
|====================================== | 54%
|
|======================================= | 56%
|
|========================================= | 58%
|
|========================================== | 60%
|
|=========================================== | 62%
|
|============================================= | 64%
|
|============================================== | 66%
|
|================================================ | 68%
|
|================================================= | 70%
|
|================================================== | 72%
|
|==================================================== | 74%
|
|===================================================== | 76%
|
|======================================================= | 78%
|
|======================================================== | 80%
|
|========================================================= | 82%
|
|=========================================================== | 84%
|
|============================================================ | 86%
|
|============================================================== | 88%
|
|=============================================================== | 90%
|
|================================================================ | 92%
|
|================================================================== | 94%
|
|=================================================================== | 96%
|
|===================================================================== | 98%
|
|======================================================================| 100%
## [05:02:35] 1 gene sets passed the thresholds across all subgroups of group `whether_smoke`.
smoke_gsea@misc$enriched_pathways## $cerebro_GSVA
## $cerebro_GSVA$whether_smoke
## # A tibble: 1 × 8
## whether_smoke name description length genes enrichment_score p_value q_value
## <fct> <chr> <chr> <int> <chr> <dbl> <dbl> <dbl>
## 1 Ex HALLM… http://www… 36 "VCA… -0.403 2.22e-4 0.0373
3.3 IL4 and IL7 pathway genes
FeaturePlot(our_total_obj, features = c("IL4", "IL17A","IL17F", "IL17B"))## Warning in FeaturePlot(our_total_obj, features = c("IL4", "IL17A", "IL17F", :
## All cells have the same value (0) of IL4.
## Warning in FeaturePlot(our_total_obj, features = c("IL4", "IL17A", "IL17F", :
## All cells have the same value (0) of IL17F.
## Warning in FeaturePlot(our_total_obj, features = c("IL4", "IL17A", "IL17F", :
## All cells have the same value (0) of IL17B.
Too low expression results in the inability of the crosstalk to generate the signaling pathway of these two genes.
The signaling pathways for IL4 and IL17 identified in the thesis are used to create a feature plot. IL7R, IRF2, and GTF3A are the genes in the IL4 signaling pathway. FADD, CASP3, CASP8, RELA, IL17RA, IL17RB, and TRADD are the genes in the IL17 signaling pathway. The result shows that those genes are distributed in each cluster.
FeaturePlot(our_total_obj, features = c("IL7R", "IRF2", "GTF3A"))FeaturePlot(our_total_obj, features = c("FADD", "CASP3", "CASP8", "RELA", "IL17RA", "IL17RB", "TRADD"))4 Conclusion
Since the flow cytometry was performed to isolate the Cytotoxic T cell (CTLs) for single-cell sequencing. The types of cells are limited, so studies on the other cells, including lung epithelial cells, cannot be reproduced with this data. The symptoms of COPD in the donors are not introduced in the supplement table. A data set more suitable for research goals should be used to find the link between COPD and lung cancer.
5 Session information
xfun::session_info()## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Locale: en_US.UTF-8 / en_US.UTF-8 / en_US.UTF-8 / C / en_US.UTF-8 / en_US.UTF-8
##
## Package version:
## abind_1.4-5 annotate_1.72.0
## AnnotationDbi_1.56.2 ape_5.6-2
## askpass_1.1 assertthat_0.2.1
## babelgene_22.3 backports_1.4.1
## base64enc_0.1-3 beachmat_2.8.1
## BH_1.78.0.0 Biobase_2.54.0
## BiocFileCache_2.0.0 BiocGenerics_0.40.0
## BiocNeighbors_1.10.0 BiocParallel_1.28.3
## BiocSingular_1.8.1 biomaRt_2.48.3
## Biostrings_2.62.0 bit_4.0.4
## bit64_4.0.5 bitops_1.0-7
## blob_1.2.3 bookdown_0.26
## broom_0.8.0 bslib_0.3.1
## cachem_1.0.6 callr_3.7.0
## caTools_1.18.2 cellranger_1.1.0
## cerebroApp_1.3.1 checkmate_2.1.0
## cli_3.3.0 clipr_0.8.0
## cluster_2.1.3 codetools_0.2-18
## colorspace_2.0-3 colourpicker_1.1.1
## commonmark_1.8.0 compiler_4.1.2
## cowplot_1.1.1 cpp11_0.4.2
## crayon_1.5.1 crosstalk_1.2.0
## curl_4.3.2 data.table_1.14.2
## DBI_1.1.2 dbplyr_2.1.1
## DelayedArray_0.20.0 DelayedMatrixStats_1.14.3
## deldir_1.0-6 digest_0.6.29
## dplyr_1.0.9 dqrng_0.3.0
## DT_0.22 dtplyr_1.2.1
## ellipsis_0.3.2 evaluate_0.15
## fansi_1.0.3 farver_2.1.0
## fastmap_1.1.0 filelock_1.0.2
## fitdistrplus_1.1-8 FNN_1.1.3
## fontawesome_0.2.2 forcats_0.5.1
## foreign_0.8-82 formatR_1.12
## Formula_1.2-4 fs_1.5.2
## futile.logger_1.4.3 futile.options_1.0.1
## future_1.25.0 future.apply_1.9.0
## gargle_1.2.0 generics_0.1.2
## GenomeInfoDb_1.30.1 GenomeInfoDbData_1.2.7
## GenomicRanges_1.46.1 ggdendro_0.1.23
## ggplot2_3.3.5 ggrepel_0.9.1
## ggridges_0.5.3 globals_0.14.0
## glue_1.6.2 goftest_1.2-3
## googledrive_2.0.0 googlesheets4_1.0.0
## gplots_3.1.3 graph_1.70.0
## graphics_4.1.2 grDevices_4.1.2
## grid_4.1.2 gridExtra_2.3
## GSEABase_1.54.0 GSVA_1.40.1
## gtable_0.3.0 gtools_3.9.2
## haven_2.5.0 HDF5Array_1.20.0
## here_1.0.1 highr_0.9
## Hmisc_4.7-0 hms_1.1.1
## htmlTable_2.4.0 htmltools_0.5.2
## htmlwidgets_1.5.4 httpuv_1.6.5
## httr_1.4.2 ica_1.0-2
## ids_1.0.1 igraph_1.3.1
## IRanges_2.28.0 irlba_2.3.5
## isoband_0.2.5 jpeg_0.1-9
## jquerylib_0.1.4 jsonlite_1.8.0
## KEGGREST_1.34.0 KernSmooth_2.23-20
## knitr_1.39 labeling_0.4.2
## lambda.r_1.2.4 later_1.3.0
## lattice_0.20-45 latticeExtra_0.6-29
## lazyeval_0.2.2 leiden_0.3.10
## lifecycle_1.0.1 limma_3.50.1
## listenv_0.8.0 lmtest_0.9-40
## lubridate_1.8.0 magrittr_2.0.3
## MASS_7.3-57 Matrix_1.4-1
## MatrixGenerics_1.6.0 matrixStats_0.62.0
## memoise_2.0.1 methods_4.1.2
## mgcv_1.8-40 mime_0.12
## miniUI_0.1.1.1 modelr_0.1.8
## msigdbr_7.5.1 munsell_0.5.0
## nlme_3.1-157 nnet_7.3-17
## openssl_2.0.0 parallel_4.1.2
## parallelly_1.31.1 patchwork_1.1.1
## pbapply_1.5-0 pillar_1.7.0
## pkgconfig_2.0.3 plogr_0.2.0
## plotly_4.10.0 plyr_1.8.7
## png_0.1-7 polyclip_1.10-0
## prettyunits_1.1.1 processx_3.5.3
## progress_1.2.2 progressr_0.10.0
## promises_1.2.0.1 ps_1.7.0
## purrr_0.3.4 qvalue_2.24.0
## R6_2.5.1 RANN_2.6.1
## rappdirs_0.3.3 RColorBrewer_1.1-3
## Rcpp_1.0.8.3 RcppAnnoy_0.0.19
## RcppArmadillo_0.11.0.0.0 RcppEigen_0.3.3.9.2
## RcppHNSW_0.3.0 RcppProgress_0.4.2
## RcppTOML_0.1.7 RCurl_1.98-1.6
## readr_2.1.2 readxl_1.4.0
## rematch_1.0.1 rematch2_2.1.2
## reprex_2.0.1 reshape2_1.4.4
## reticulate_1.24 rgeos_0.5-9
## rhdf5_2.36.0 rhdf5filters_1.4.0
## Rhdf5lib_1.14.2 rlang_1.0.2
## rmarkdown_2.14 rmdformats_1.0.3
## ROCR_1.0-11 rpart_4.1.16
## rprojroot_2.0.3 RSpectra_0.16-1
## RSQLite_2.2.13 rstudioapi_0.13
## rsvd_1.0.5 Rtsne_0.16
## rvest_1.0.2 S4Vectors_0.32.3
## sass_0.4.1 ScaledMatrix_1.0.0
## scales_1.2.0 scattermore_0.8
## SCINA_1.2.0 sctransform_0.3.3
## selectr_0.4.2 Seurat_4.1.1
## SeuratObject_4.1.0 shiny_1.7.1
## shinycssloaders_1.0.0 shinydashboard_0.7.2
## shinyFiles_0.9.1 shinyjs_2.1.0
## shinyWidgets_0.6.4 SingleCellExperiment_1.14.1
## SingleR_1.6.1 sitmo_2.0.2
## snow_0.4.4 sourcetools_0.1.7
## sp_1.4-7 sparseMatrixStats_1.4.2
## spatstat.core_2.4-2 spatstat.data_2.2-0
## spatstat.geom_2.4-0 spatstat.random_2.2-0
## spatstat.sparse_2.1-1 spatstat.utils_2.3-0
## splines_4.1.2 stats_4.1.2
## stats4_4.1.2 stringi_1.7.6
## stringr_1.4.0 SummarizedExperiment_1.24.0
## survival_3.3-1 sys_3.4
## tensor_1.5 tibble_3.1.6
## tidyr_1.2.0 tidyselect_1.1.2
## tidyverse_1.3.1 tinytex_0.38
## tools_4.1.2 tzdb_0.3.0
## utf8_1.2.2 utils_4.1.2
## uuid_1.1.0 uwot_0.1.11
## vctrs_0.4.1 viridis_0.6.2
## viridisLite_0.4.0 vroom_1.5.7
## withr_2.5.0 xfun_0.30
## XML_3.99-0.9 xml2_1.3.3
## xtable_1.8-4 XVector_0.34.0
## yaml_2.3.5 zlibbioc_1.40.0
## zoo_1.8-10